Data Exploration and Preparing for Modelling

# Load necessary libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(lubridate)
library(skimr)
library(timetk)
## Warning: package 'timetk' was built under R version 4.3.2
## 
## Attaching package: 'timetk'
## 
## The following object is masked from 'package:data.table':
## 
##     :=
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, month, week, year
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## Warning: package 'dials' was built under R version 4.3.2
## Warning: package 'infer' was built under R version 4.3.2
## Warning: package 'modeldata' was built under R version 4.3.2
## Warning: package 'parsnip' was built under R version 4.3.2
## Warning: package 'rsample' was built under R version 4.3.2
## Warning: package 'tune' was built under R version 4.3.2
## Warning: package 'workflows' was built under R version 4.3.2
## Warning: package 'workflowsets' was built under R version 4.3.2
## Warning: package 'yardstick' was built under R version 4.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ data.table::between()   masks dplyr::between()
## ✖ scales::discard()       masks purrr::discard()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ data.table::first()     masks dplyr::first()
## ✖ recipes::fixed()        masks stringr::fixed()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ data.table::last()      masks dplyr::last()
## ✖ yardstick::spec()       masks readr::spec()
## ✖ recipes::step()         masks stats::step()
## ✖ data.table::transpose() masks purrr::transpose()
## • Learn how to get started at https://www.tidymodels.org/start/
library(modeltime)
## Warning: package 'modeltime' was built under R version 4.3.2
## 
## Attaching package: 'modeltime'
## 
## The following object is masked from 'package:data.table':
## 
##     :=
# Check the current working directory
getwd()
## [1] "C:/Users/ACER/Desktop/R github portfolio"
# Load and explore the dataset
data <- read.csv("airpassengers.csv")

# Data Exploration
# Display the structure of the data
str(data)
## 'data.frame':    144 obs. of  2 variables:
##  $ Month       : chr  "1949-01" "1949-02" "1949-03" "1949-04" ...
##  $ X.Passengers: int  112 118 132 129 121 135 148 148 136 119 ...
# Visualize the data in the RStudio Viewer
View(data)

# Rename columns
colnames(data) <- c('Date', 'Count')

# Display the structure of the data after renaming columns
data %>% glimpse()
## Rows: 144
## Columns: 2
## $ Date  <chr> "1949-01", "1949-02", "1949-03", "1949-04", "1949-05", "1949-06"…
## $ Count <int> 112, 118, 132, 129, 121, 135, 148, 148, 136, 119, 104, 118, 115,…
# Preprocess date column
data$Date <- paste(data$Date, "-01", sep="")
data$Date <- as.Date(data$Date, format="%Y-%m-%d")

# Display the structure of the data after preprocessing date column
str(data)
## 'data.frame':    144 obs. of  2 variables:
##  $ Date : Date, format: "1949-01-01" "1949-02-01" ...
##  $ Count: int  112 118 132 129 121 135 148 148 136 119 ...
# Visualize time series data
# Assuming `interactive` is a logical variable
data %>% plot_time_series(Date, Count, .interactive = TRUE)  
# Time series splitting
splits <- initial_time_split(data, prop = 0.9)

Modelling

# Model 1: arima_boost ----
model_fit_arima_boosted <- arima_boost(
  min_n = 2,
  learn_rate = 0.015
) %>%
  set_engine(engine = "auto_arima_xgboost") %>%
  fit(Count ~ Date + as.numeric(Date) + factor(format(Date, "%m"), ordered = FALSE),
      data = training(splits))
## frequency = 12 observations per 1 year
# Model 2: prophet ----
model_fit_prophet <- prophet_reg() %>%
  set_engine(engine = "prophet") %>%
  fit(Count ~ Date, data = training(splits))
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Model 3: ets ----
model_fit_ets <- exp_smoothing() %>%
  set_engine(engine = "ets") %>%
  fit(Count ~ Date, data = training(splits))
## frequency = 12 observations per 1 year

Model Evaluation

# Compare RMSE score on test data
predictions_arima_boosted <- predict(model_fit_arima_boosted, new_data = testing(splits)) %>%
  bind_cols(testing(splits))
predictions_prophet <- predict(model_fit_prophet, new_data = testing(splits)) %>%
  bind_cols(testing(splits))
predictions_ets <- predict(model_fit_ets, new_data = testing(splits)) %>%
  bind_cols(testing(splits))

rmse_arima_boosted <- sqrt(mean((predictions_arima_boosted$Count - predictions_arima_boosted$.pred)^2))
rmse_prophet <- sqrt(mean((predictions_prophet$Count - predictions_prophet$.pred)^2))
rmse_ets <- sqrt(mean((predictions_ets$Count - predictions_ets$.pred)^2))

cat("RMSE for arima_boosted:", rmse_arima_boosted, "\n")
## RMSE for arima_boosted: 19.35821
cat("RMSE for prophet:", rmse_prophet, "\n")
## RMSE for prophet: 39.10063
cat("RMSE for ets:", rmse_ets, "\n")
## RMSE for ets: 26.07049
# Model Selection and Forecasting
# Make forecast on the model with the lowest RMSE score
lowest_rmse_model <- which.min(c(rmse_arima_boosted, rmse_prophet, rmse_ets))
if (lowest_rmse_model == 1) {
  final_forecast <- predict(model_fit_arima_boosted, new_data = data) %>%
    bind_cols(data)
} else if (lowest_rmse_model == 2) {
  final_forecast <- predict(model_fit_prophet, new_data = data) %>%
    bind_cols(data)
} else {
  final_forecast <- predict(model_fit_ets, new_data = data) %>%
    bind_cols(data)
}

# Visualize past data and forecast values on one plot with different colors
ggplot() +
  geom_line(data = final_forecast, aes(x = Date, y = .pred, color = "Forecast"), linetype = "dashed") +
  geom_line(data = data, aes(x = Date, y = Count, color = "Actual")) +
  labs(title = "Past Data and Forecast Values",
       x = "Date",
       y = "Count") +
  scale_color_manual(values = c("Actual" = "blue", "Forecast" = "red"), guide = "legend") +
  theme_minimal()